In this notebook, we use the `tmb_vaf_genomic.tsv` file
generated by the `01-preprocess-data.Rmd` script.
suppressPackageStartupMessages({
library(tidyverse)
library(scales)
})
# Detect the ".git" folder. This will be in the project root directory.
# Use this as the root directory to ensure proper sourcing of functions
# no matter where this is called from.
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
scratch_dir <- file.path(root_dir, "scratch")
analysis_dir <- file.path(root_dir, "analyses", "tmb-vaf-longitudinal")
input_dir <- file.path(analysis_dir, "input")

# File path to results directory; created if it does not exist yet.
results_dir <- file.path(analysis_dir, "results")
if (!dir.exists(results_dir)) {
  # recursive = TRUE so missing parent directories do not make this fail
  dir.create(results_dir, recursive = TRUE)
}

# Input files
tmb_genomic_file <- file.path(scratch_dir, "tmb_vaf_genomic.tsv")
palette_file <- file.path(
  root_dir, "figures", "palettes", "tumor_descriptor_color_palette.tsv"
)
# File path to plots directory; created if it does not exist yet.
plots_dir <- file.path(analysis_dir, "plots")
if (!dir.exists(plots_dir)) {
  dir.create(plots_dir, recursive = TRUE)
}

# File path to dumbbell plots directory (nested under plots_dir).
dumbbell_plots_dir <- file.path(plots_dir, "dumbbell")
if (!dir.exists(dumbbell_plots_dir)) {
  # recursive = TRUE so this also works when plots_dir itself is missing
  dir.create(dumbbell_plots_dir, recursive = TRUE)
}
# Source the analysis utility scripts and the figure theme script.
# Paths are built with file.path() rather than by pasting "/" by hand.
source(file.path(analysis_dir, "util", "function-create-barplot.R"))
source(file.path(analysis_dir, "util", "function-create-dumbbell-plot.R"))
source(file.path(root_dir, "figures", "scripts", "theme.R"))
# Load the tmb_genomic table and flag participants that carry more than one
# cancer_group. The result is left grouped by Kids_First_Participant_ID.
df_total <- readr::read_tsv(
  tmb_genomic_file,
  guess_max = 100000,
  show_col_types = FALSE
) %>%
  dplyr::group_by(Kids_First_Participant_ID) %>%
  dplyr::mutate(
    # TRUE when the participant has different diagnoses across timepoints
    cg_distinct = dplyr::n_distinct(cancer_group) > 1
  )
# Are there any participants with both WGS and WXS?
# Reduce to one row per participant/strategy pair first, so each strategy is
# listed once per participant rather than once per input row.
df_total %>%
  dplyr::ungroup() %>%
  dplyr::distinct(Kids_First_Participant_ID, experimental_strategy) %>%
  dplyr::arrange(Kids_First_Participant_ID, experimental_strategy) %>%
  dplyr::group_by(Kids_First_Participant_ID) %>%
  dplyr::summarise(
    experimental_strategy_sum = str_c(experimental_strategy, collapse = ";")
  )
# Some participants have both strategies, so WXS rows are removed from
# downstream analyses.
# NOTE(review): the pipeline below is cut off mid-statement in this copy (the
# `collapse` separator and everything after it are missing); restore it from
# the original notebook before running.
df <- df_total %>%
filter(!experimental_strategy == \WXS\) %>%
dplyr::mutate(patient_id = paste(short_histology, Kids_First_Participant_ID, sep = \_\)) %>%
distinct(cancer_group, .keep_all = TRUE) %>%
summarise(cg_sum = str_c(cancer_group, collapse = \
# A tibble: 38 × 2
cg_sum n
<chr> <int>
1 High-grade glioma 107604
2 Low-grade glioma 38088
3 Atypical Teratoid Rhabdoid Tumor 27090
4 Sarcoma,Rosai-Dorfman disease 19705
5 Medulloblastoma 11943
6 Diffuse midline glioma 9944
7 Medulloblastoma,Meningioma 4331
8 CNS Embryonal tumor 1800
9 Ganglioglioma 1312
10 Ewing sarcoma 1106
# ℹ 28 more rows
# Summarize cancer groups with n < 10 as "Other" and use this for
# visualization purposes.
# NOTE(review): `n` is the number of rows of `df` per cg_sum, not the number of
# distinct bs_samples; if `df` holds several rows per biospecimen the threshold
# is looser than the "< 10 bs_samples" intent — confirm.
cg_sum_df <- df %>%
  count(cg_sum) %>%
  dplyr::mutate(cg_sum_n = glue::glue("{cg_sum} (N={n})"))

df <- df %>%
  left_join(cg_sum_df, by = "cg_sum") %>%
  mutate(
    cg_plot = case_when(
      n < 10 ~ "Other",
      TRUE ~ cg_sum
    ),
    cg_kids_id = paste(cg_sum, Kids_First_Participant_ID, sep = "_")
  )

# Tally rows per cg_plot, largest first.
print(df %>% count(cg_plot) %>% arrange(desc(n)))
# A tibble: 38 × 2
cg_plot n
<chr> <int>
1 High-grade glioma 107604
2 Low-grade glioma 38088
3 Atypical Teratoid Rhabdoid Tumor 27090
4 Sarcoma,Rosai-Dorfman disease 19705
5 Medulloblastoma 11943
6 Diffuse midline glioma 9944
7 Medulloblastoma,Meningioma 4331
8 CNS Embryonal tumor 1800
9 Ganglioglioma 1312
10 Ewing sarcoma 1106
# ℹ 28 more rows
# Load the color palette table
palette_df <- readr::read_tsv(
  palette_file,
  guess_max = 100000,
  show_col_types = FALSE
)

# Named vector: hex codes keyed by color name
palette <- stats::setNames(palette_df$hex_codes, palette_df$color_names)
# length(unique(df$Kids_First_Participant_ID))
We will explore TMB per Kids_First_Participant_ID over
time by creating stacked barplots.
# Define parameters for function
# Upper y-axis limit: the largest TMB value, ignoring missing values so a
# single NA does not turn the limit into NA.
ylim <- max(df$tmb, na.rm = TRUE)
x_value <- df$Kids_First_Participant_ID

# Re-order df
# Timepoint levels, in the order handed to fct_relevel()
f <- c(
  "Second Malignancy", "Unavailable", "Deceased",
  "Recurrence", "Progressive", "Diagnosis"
)
df_plot <- df %>%
  dplyr::mutate(
    tumor_descriptor = factor(tumor_descriptor),
    tumor_descriptor = fct_relevel(tumor_descriptor, f)
  )

# Run function
fname <- file.path(plots_dir, "TMB-genomic-total.pdf")
print(fname)
[1] \/home/rstudio/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/TMB-genomic-total.pdf\
# Build the stacked barplot from df_plot and write it to fname
# (PDF device opened with width 22 and height 6).
p <- create_stacked_barplot(tmb_df = df_plot, ylim = ylim, x = x_value, palette = palette)
pdf(file = fname, width = 22, height = 6)
print(p)
# Close the PDF device opened above.
dev.off()
png
2
Attention: Hypermutant TMB is defined as ≥10 mutations/Mb, and ultra-hypermutant TMB as ≥100 mutations/Mb, in pediatric brain tumors (https://pubmed.ncbi.nlm.nih.gov/29056344/).
Here, we notice that there are samples with high TMB (hypermutant samples). Next, we will exclude these samples (TMB >= 10) from downstream analysis. Care is needed when a patient has a high number of mutations at only one timepoint, because removing that sample leaves unmatched longitudinal samples. We will remove those cases as well, so that we always have matched longitudinal samples.
# Filter df and remove any samples with single timepoints
#
# Hypermutant rows (tmb >= 10) are dropped first, and this same filtered table
# is used for the join further down so that the excluded rows are not brought
# back for participants that are kept.
df_non_hyper <- df %>%
  filter(tmb < 10)

# Timepoint summaries that denote a participant without matched timepoints.
# NOTE(review): this is an explicit list; other repeats (for example
# "Recurrence;Recurrence") are not covered — confirm whether they can occur.
single_timepoint <- c(
  "Diagnosis", "Progressive", "Recurrence", "Second Malignancy",
  "Unavailable", "Deceased", "Diagnosis;Diagnosis", "Progressive;Progressive"
)

df_plot_filter <- df_non_hyper %>%
  unique() %>%
  arrange(Kids_First_Participant_ID, tumor_descriptor) %>%
  group_by(Kids_First_Participant_ID) %>%
  dplyr::summarise(
    tumor_descriptor_sum2 = str_c(tumor_descriptor, collapse = ";")
  ) %>%
  dplyr::filter(!tumor_descriptor_sum2 %in% single_timepoint) %>%
  left_join(df_non_hyper, by = "Kids_First_Participant_ID") %>%
  drop_na(tmb) %>%
  mutate(
    # Replace every "/" (not just the first); cg_plot is later used to build
    # output file names.
    cg_plot = str_replace_all(cg_plot, "/", " "),
    tumor_descriptor = factor(tumor_descriptor),
    tumor_descriptor = fct_relevel(tumor_descriptor, f)
  ) %>%
  arrange(tumor_descriptor)
# length(unique(df_plot_filter$Kids_First_Participant_ID))
# Define parameters for function
# Upper y-axis limit: the largest TMB value left after filtering.
ylim <- max(df_plot_filter$tmb)
x_value <- df_plot_filter$cg_kids_id

# Run function
fname <- file.path(plots_dir, "TMB-genomic-no-hypermutants.pdf")
print(fname)
[1] \/home/rstudio/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/TMB-genomic-no-hypermutants.pdf\
# Build the stacked barplot from df_plot_filter and write it to fname
# (PDF device opened with width 25 and height 10).
p <- create_stacked_barplot(tmb_df = df_plot_filter, ylim = ylim, x = x_value, palette = palette)
pdf(file = fname, width = 25, height = 10)
print(p)
# Close the PDF device opened above.
dev.off()
png
2
We will explore TMB per cancer group over time by creating dumbbell plots. We classified by using cancer types with the highest number of samples (High- and Low-grade gliomas) versus any other cancer groups.
# Tally rows for every cg_plot / participant combination
# print(table(df_plot_filter$cg_plot))
df_plot_filter %>%
  count(cg_plot, Kids_First_Participant_ID) %>%
  print()
# A tibble: 117 × 3
cg_plot Kids_First_Participa…¹ n
<chr> <chr> <int>
1 Adamantinomatous Craniopharyngioma PT_CBTW4E3X 18
2 Adamantinomatous Craniopharyngioma PT_WYXTEG3E 46
3 Adamantinomatous Craniopharyngioma PT_YK7AD0KK 38
4 Adamantinomatous Craniopharyngioma,Craniopharyn… PT_WWZWD4KC 20
5 Adamantinomatous Craniopharyngioma,Meningioma PT_T2M1338J 47
6 Atypical Teratoid Rhabdoid Tumor PT_0WQFCZ6S 558
7 Atypical Teratoid Rhabdoid Tumor PT_3KM9W8S8 1035
8 Atypical Teratoid Rhabdoid Tumor PT_6N825561 20007
9 Atypical Teratoid Rhabdoid Tumor PT_DVXE38EX 1782
10 Atypical Teratoid Rhabdoid Tumor PT_ESHACWF6 1431
# ℹ 107 more rows
# ℹ abbreviated name: ¹​Kids_First_Participant_ID
# Dumbbell plot per cancer group
# Distinct cg_plot labels, sorted in increasing order
cancer_groups <- df_plot_filter$cg_plot %>%
  as.character() %>%
  unique() %>%
  sort()
print(cancer_groups)
[1] \Adamantinomatous Craniopharyngioma\
[2] \Adamantinomatous Craniopharyngioma
# Write one dumbbell plot PDF per cancer group.
# NOTE(review): the y-axis limits below are keyed on the position of a group in
# the sorted `cancer_groups` vector, so they shift whenever the set of groups
# changes — confirm the indices still point at the intended groups.
for (i in seq_along(cancer_groups)) {
  print(i)

  df_ct_sub <- df_plot_filter %>%
    filter(cg_plot == cancer_groups[i])

  print(cancer_groups[i])

  # Define parameters for function
  if (i %in% c(3, 7, 8)) {
    ylim <- 2
  } else if (i == 2) {
    ylim <- 6
  } else {
    ylim <- 4
  }

  # Name plots
  fname <- file.path(
    dumbbell_plots_dir,
    paste0(cancer_groups[i], "-TMB-dumbbell", ".pdf")
  )
  print(fname)

  # Run function
  p <- create_dumbbell_ct(
    tmb_df = df_ct_sub,
    ylim = ylim,
    ct_id = cancer_groups[i],
    palette = palette
  )

  pdf(file = fname, width = 12, height = 8)
  # Close the device even when drawing fails, so a failed plot does not leave
  # an open PDF device behind for the next iteration.
  tryCatch(print(p), finally = dev.off())
}
[1] 1
[1] \Adamantinomatous Craniopharyngioma\
[1] \/home/rstudio/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/dumbbell/Adamantinomatous Craniopharyngioma-TMB-dumbbell.pdf\
[1] 2
[1] \Adamantinomatous Craniopharyngioma
[1] 3
[1] \Adamantinomatous Craniopharyngioma
[1] 4
[1] \Atypical Teratoid Rhabdoid Tumor\
[1] \/home/rstudio/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/dumbbell/Atypical Teratoid Rhabdoid Tumor-TMB-dumbbell.pdf\
[1] 5
[1] \Chordoma\
[1] \/home/rstudio/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/dumbbell/Chordoma-TMB-dumbbell.pdf\
[1] 6
[1] \Choroid plexus carcinoma\
[1] \/home/rstudio/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/dumbbell/Choroid plexus carcinoma-TMB-dumbbell.pdf\
[1] 7
[1] \CNS Embryonal tumor\
[1] \/home/rstudio/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/dumbbell/CNS Embryonal tumor-TMB-dumbbell.pdf\
[1] 8
[1] \CNS Embryonal tumor
[1] 9
[1] \CNS Embryonal tumor
[1] 10
[1] \Diffuse leptomeningeal glioneuronal tumor
[1] 11
[1] \Diffuse midline glioma\
[1] \/home/rstudio/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/dumbbell/Diffuse midline glioma-TMB-dumbbell.pdf\
[1] 12
[1] \Dysembryoplastic neuroepithelial tumor\
[1] \/home/rstudio/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/dumbbell/Dysembryoplastic neuroepithelial tumor-TMB-dumbbell.pdf\
[1] 13
[1] \Dysembryoplastic neuroepithelial tumor
[1] 14
[1] \Ependymoma\
[1] \/home/rstudio/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/dumbbell/Ependymoma-TMB-dumbbell.pdf\
[1] 15
[1] \Ewing sarcoma\
[1] \/home/rstudio/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/dumbbell/Ewing sarcoma-TMB-dumbbell.pdf\
[1] 16
[1] \Ganglioglioma\
[1] \/home/rstudio/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/dumbbell/Ganglioglioma-TMB-dumbbell.pdf\
[1] 17
[1] \Ganglioglioma
[1] 18
[1] \Ganglioglioma
[1] 19
[1] \Hemangioblastoma\
[1] \/home/rstudio/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/dumbbell/Hemangioblastoma-TMB-dumbbell.pdf\
[1] 20
[1] \High-grade glioma\
[1] \/home/rstudio/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/dumbbell/High-grade glioma-TMB-dumbbell.pdf\
[1] 21
[1] \Low-grade glioma\
[1] \/home/rstudio/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/dumbbell/Low-grade glioma-TMB-dumbbell.pdf\
[1] 22
[1] \Low-grade glioma
[1] 23
[1] \Low-grade glioma
[1] 24
[1] \Low-grade glioma
[1] 25
[1] \Medulloblastoma\
[1] \/home/rstudio/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/dumbbell/Medulloblastoma-TMB-dumbbell.pdf\
[1] 26
[1] \Medulloblastoma
[1] 27
[1] \Meningioma\
[1] \/home/rstudio/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/dumbbell/Meningioma-TMB-dumbbell.pdf\
[1] 28
[1] \Neuroblastoma
[1] 29
[1] \Neurofibroma Plexiform\
[1] \/home/rstudio/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/dumbbell/Neurofibroma Plexiform-TMB-dumbbell.pdf\
[1] 30
[1] \Neurofibroma Plexiform
[1] 31
[1] \Pilocytic astrocytoma
[1] 32
[1] \Schwannoma\
[1] \/home/rstudio/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/dumbbell/Schwannoma-TMB-dumbbell.pdf\
Here, we want to explore the number of mutations per timepoint and biospecimen sample per patient case.
# Distinct participant IDs in df_plot_filter, in order of first appearance
samples <- df_plot_filter$Kids_First_Participant_ID %>%
  as.character() %>%
  unique()
print(samples)
[1] \PT_2ECVKTTQ\ \PT_82MX6J77\ \PT_98QMQZY7\ \PT_9PJR0ZK7\ \PT_T2M1338J\
[6] \PT_WWRB8KDQ\ \PT_19GCSK2S\ \PT_1H2REHT2\ \PT_23NZGSRJ\ \PT_37B5JRP1\
[11] \PT_394ZA6P7\ \PT_3AR6AW9N\ \PT_3KM9W8S8\ \PT_5NS35B66\ \PT_6N825561\
[16] \PT_75HRTX4S\ \PT_8GN3TQRM\ \PT_CWVNNBPH\ \PT_CXT81GRM\ \PT_DNAJYFZT\
[21] \PT_DR94DMTG\ \PT_DVXE38EX\ \PT_FA2F3HQG\ \PT_FN57KS79\ \PT_FWWRWTV2\
[26] \PT_GTHZF21E\ \PT_HFQNKP5X\ \PT_HJMP6PH2\ \PT_KBFM551M\ \PT_KTRJ8TFY\
[31] \PT_KZ56XHJT\ \PT_MDWPRDBT\ \PT_MNSEJCDM\ \PT_NJQ26FHN\ \PT_NZ85YSJ1\
[36] \PT_PR4YBBH3\ \PT_RJ1TJ2KH\ \PT_SD4RJ57T\ \PT_WYXTEG3E\ \PT_YND59052\
[41] \PT_Z4BF2NSB\ \PT_00G007DM\ \PT_0DWRY9ZX\ \PT_0WQFCZ6S\ \PT_2MZPGZN1\
[46] \PT_2YT37G8P\ \PT_3T3VGWC6\ \PT_7M2PGCBV\ \PT_89XRZBSG\ \PT_962TCBVR\
[51] \PT_99S5BPE3\ \PT_BRVGRXQY\ \PT_BZCJMEX8\ \PT_C1RDBCVM\ \PT_CBTW4E3X\
[56] \PT_CWXSP19D\ \PT_D6AJHDST\ \PT_DFQAH7RS\ \PT_EQX0VT4F\ \PT_HHG37M6W\
[61] \PT_HXV713W6\ \PT_JP1FDKN9\ \PT_KMHGNCNR\ \PT_N8W26H19\ \PT_NPETR8RY\
[66] \PT_PF04R0BH\ \PT_PFA762TK\ \PT_QH9H491G\ \PT_S2SQJVGK\ \PT_T4VN7ZRB\
[71] \PT_TP6GS00H\ \PT_W6AWJJK7\ \PT_XZGWKXC5\ \PT_YK7AD0KK\ \PT_Z4GS3ZQQ\
[76] \PT_ZZRBX5JT\ \PT_02J5CWN5\ \PT_04V47WFC\ \PT_1ZAWNGWT\ \PT_25Z2NX27\
[81] \PT_2FVTD0WR\ \PT_39H4JN6H\ \PT_3GYW6P6P\ \PT_3P3HARZ2\ \PT_3R0P995B\
[86] \PT_3VCS1PPF\ \PT_5CYJ3NZ9\ \PT_5ZPPR06P\ \PT_62G82T6Q\ \PT_6S1TFJ3D\
[91] \PT_773ZPTEB\ \PT_7WYPEC3Q\ \PT_9S6WMQ92\ \PT_AQWDQW27\ \PT_AV0W0V8D\
[96] \PT_B5DQ8FF0\ \PT_ESHACWF6\ \PT_FN4GEEFR\ \PT_HE8FBFNA\ \PT_JNEV57VK\
[101] \PT_JSFBMK5V\ \PT_K8ZV7APT\ \PT_NK8A49X5\ \PT_P571HTNK\ \PT_PAPEQ0T0\
[106] \PT_QJDY4Y9P\ \PT_S4YNE17X\ \PT_TKWTTRQ7\ \PT_TRZ1N1HQ\ \PT_VTG1S395\
[111] \PT_WP871F5S\ \PT_WWZWD4KC\ \PT_XA98HG1C\ \PT_XTVQB9S4\ \PT_Y98Q8XKV\
[116] \PT_YGN06RPZ\ \PT_ZMKMKCFQ\
# Print one plot per participant via create_barplot_sample().
# Define parameters for function
# The y-axis limit is taken over the whole table, not the per-participant
# subset, so it is the same on every iteration and is computed once here.
ylim <- max(df_plot_filter$mutation_count)

for (i in seq_along(samples)) {
  print(i)

  tmb_sub <- df_plot_filter %>%
    filter(Kids_First_Participant_ID == samples[i])

  # Run function
  p <- create_barplot_sample(
    tmb_df = tmb_sub,
    ylim = ylim,
    sid = samples[i],
    palette = palette
  )
  print(p)
}
[1] 1
[1] 2
[1] 3
[1] 4
[1] 5
[1] 6
[1] 7
[1] 8
[1] 9
[1] 10
[1] 11
[1] 12
[1] 13
[1] 14
[1] 15
[1] 16
[1] 17
[1] 18
[1] 19
[1] 20
[1] 21
[1] 22
[1] 23
[1] 24
[1] 25
[1] 26
[1] 27
[1] 28
[1] 29
[1] 30
[1] 31
[1] 32
[1] 33
[1] 34
[1] 35
[1] 36
[1] 37
[1] 38
[1] 39
[1] 40
[1] 41
[1] 42
[1] 43
[1] 44
[1] 45
[1] 46
[1] 47
[1] 48
[1] 49
[1] 50
[1] 51
[1] 52
[1] 53
[1] 54
[1] 55
[1] 56
[1] 57
[1] 58
[1] 59
[1] 60
[1] 61
[1] 62
[1] 63
[1] 64
[1] 65
[1] 66
[1] 67
[1] 68
[1] 69
[1] 70
[1] 71
[1] 72
[1] 73
[1] 74
[1] 75
[1] 76
[1] 77
[1] 78
[1] 79
[1] 80
[1] 81
[1] 82
[1] 83
[1] 84
[1] 85
[1] 86
[1] 87
[1] 88
[1] 89
[1] 90
[1] 91
[1] 92
[1] 93
[1] 94
[1] 95
[1] 96
[1] 97
[1] 98
[1] 99
[1] 100
[1] 101
[1] 102
[1] 103
[1] 104
[1] 105
[1] 106
[1] 107
[1] 108
[1] 109
[1] 110
[1] 111
[1] 112
[1] 113
[1] 114
[1] 115
[1] 116
[1] 117
# Print the R version, platform, and attached/loaded packages for this run.
sessionInfo()
R version 4.2.3 (2023-03-15)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 22.04.2 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] grid stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] ggthemes_4.2.4 scales_1.2.1 lubridate_1.9.2 forcats_1.0.0
[5] stringr_1.5.0 dplyr_1.1.1 purrr_1.0.1 readr_2.1.4
[9] tidyr_1.3.0 tibble_3.2.1 ggplot2_3.4.0 tidyverse_2.0.0
loaded via a namespace (and not attached):
[1] highr_0.10 bslib_0.4.2 compiler_4.2.3 pillar_1.9.0
[5] jquerylib_0.1.4 tools_4.2.3 bit_4.0.5 digest_0.6.31
[9] timechange_0.2.0 jsonlite_1.8.4 evaluate_0.20 lifecycle_1.0.3
[13] gtable_0.3.3 pkgconfig_2.0.3 rlang_1.1.0 cli_3.6.1
[17] parallel_4.2.3 yaml_2.3.7 xfun_0.38 fastmap_1.1.1
[21] withr_2.5.0 knitr_1.42 generics_0.1.3 vctrs_0.6.2
[25] sass_0.4.5 hms_1.1.3 bit64_4.0.5 rprojroot_2.0.3
[29] tidyselect_1.2.0 glue_1.6.2 R6_2.5.1 fansi_1.0.4
[33] vroom_1.6.1 rmarkdown_2.21 farver_2.1.1 tzdb_0.3.0
[37] magrittr_2.0.3 htmltools_0.5.5 colorspace_2.1-0 labeling_0.4.2
[41] utf8_1.2.3 stringi_1.7.12 munsell_0.5.0 cachem_1.0.7
[45] crayon_1.5.2